#include <malloc.h>
#include "hs/ppu.h"
#include "hs/type.h"
#include "bus.h"
#include "util.h"
#include "cpu.h"

// Non-zero when bit 7 of ppu->control is set; ines_ppu_run uses it to decide
// whether to request an NMI when the vertical-blank flag is raised.
// The argument and the whole expansion are parenthesized so the macro can be
// negated or combined with other operators without precedence surprises.
#define VBL_ENABLE(ppu) \
        ((((ppu)->control) & 0x80) == 0x80)

// Non-zero when bit 3 or bit 4 of the given mask value is set
// (bit 3 is the "show background" bit used by the background mixer).
#define RENDER_ENABLE(mask) \
        (((mask) & 0x18) != 0)

// Nametable mirroring lookup: row = mirroring mode (indexed directly by the
// Mirroring value), column = logical 1 KiB nametable slot (0..3), value = the
// physical 1 KiB bank inside ppu->vram that backs that slot.
// NOTE(review): the row order must match the Mirroring enum declared
// elsewhere — presumably horizontal, vertical, single-screen 0,
// single-screen 1, four-screen; confirm against the enum.
static const uint8 MIRROR_LOOK_TABLE[][4] = {
        [0] = {0, 0, 1, 1},
        [1] = {0, 1, 0, 1},
        [2] = {0, 0, 0, 0},
        [3] = {1, 1, 1, 1},
        [4] = {0, 1, 2, 3}
};

/**
 * Translate a PPU nametable address into an offset inside ppu->vram.
 *
 * The address is first reduced to the 4 KiB window that starts at 0x2000,
 * then split into a 1 KiB slot number and an offset inside that slot. The
 * slot is remapped through MIRROR_LOOK_TABLE for the given mirroring mode.
 *
 * @param mirroring row selector for MIRROR_LOOK_TABLE
 * @param addr      PPU address (callers pass values at or above 0x2000)
 * @return physical bank * 0x400 + offset within the bank
 */
static inline uint16 ppu_vram_mirror(Mirroring mirroring, uint16 addr) {
    const uint16 relative = (addr - 0x2000) % 0x1000;
    const uint16 slot = relative / 0x0400;
    const uint16 bank = MIRROR_LOOK_TABLE[mirroring][slot];
    return bank * 0x0400 + (relative % 0x0400);
}

/**
 * Translate a PPU palette address ($3F00-$3FFF) into an index (0..31) into
 * ppu->palette.
 *
 * The palette is 32 bytes and repeats every 0x20 bytes. In addition the
 * entries $3F10/$3F14/$3F18/$3F1C alias $3F00/$3F04/$3F08/$3F0C.
 *
 * The previous version compared against 0x3f0 (a typo for 0x3f10) and only
 * tested the un-mirrored address, so $3F10 and the aliases in the repeated
 * range (e.g. $3F30) were never remapped. Reducing the address first and
 * testing the bit pattern handles all of them.
 *
 * @param addr PPU address inside the palette range
 * @return index into the 32-byte palette array
 */
static inline uint16 ppu_palette_mirror(uint16 addr) {
    uint16 index = addr & 0x1F;
    // 0x10, 0x14, 0x18, 0x1C: bit 4 set and the low two bits clear.
    if ((index & 0x13) == 0x10) {
        index &= 0x0F;
    }
    return index;
}

/**
 * Advance the VRAM address v after a data-port access.
 *
 * The step is selected by PPUCTRL bit 2 (0x04): clear -> +1 (going across),
 * set -> +32 (going down). The previous version tested bit 1, which is one
 * of the nametable-select bits that ines_ppu_control copies into t.
 */
static inline void ppu_vram_increment(PPU *ppu) {
    uint16 step = (ppu->control & 0x04) ? 32 : 1;
    ppu->v = ppu->v + step;
}

/**
 * Fetch the nametable byte (tile index) for the tile addressed by v and
 * store it in ppu->tileIdx.
 *
 * The nametable address is formed from the low 12 bits of v placed in the
 * 0x2000 region, then mapped into ppu->vram according to the mirroring mode.
 */
static inline void ppu_read_tile_idx(PPU *ppu, Mirroring mirroring) {
    const uint16 nametableAddr = 0x2000 | (ppu->v & 0x0FFF);
    ppu->tileIdx = ppu->vram[ppu_vram_mirror(mirroring, nametableAddr)];
}

/**
 * Fetch the attribute-table byte covering the tile addressed by v and store
 * it in ppu->tileAttr.
 *
 * The attribute address is 0x23C0 combined with the nametable-select bits of
 * v, the top three bits of coarse Y and the top three bits of coarse X.
 */
static inline void ppu_read_tile_attr(PPU *ppu, Mirroring mirroring) {
    const uint16 v = ppu->v;
    const uint16 nametableSelect = v & 0x0c00;
    const uint16 attrRow = (v >> 4) & 0x38;   // coarse Y / 4, placed in bits 3-5
    const uint16 attrColumn = (v >> 2) & 0x07; // coarse X / 4, placed in bits 0-2
    const uint16 attrAddr = 0x23c0 | nametableSelect | attrRow | attrColumn;
    ppu->tileAttr = ppu->vram[ppu_vram_mirror(mirroring, attrAddr)];
}

/**
 * Fetch one bit plane of the current background tile row from CHR memory.
 *
 * Address layout: pattern-table base + tileIdx * 16 + fine Y, plus 8 for the
 * upper bit plane (the two planes of a tile are stored 8 bytes apart).
 *
 * Fixes over the previous version:
 *  - the background pattern table is selected by PPUCTRL bit 4 (0x10) only;
 *    masking with 0x1F mixed the nametable, increment and sprite-table bits
 *    into the address;
 *  - the upper plane was read from the same address as the lower plane, so
 *    both latches always held identical data.
 *
 * @param console console whose cartridge provides ines_chr_read
 * @param ppu     PPU state (reads v, control, tileIdx; writes the tile latch)
 * @param upper   true -> store into upperTile, false -> store into lowerTile
 */
static inline void ppu_read_tile(NesConsole *console, PPU *ppu, bool upper) {
    uint8 fineY = (ppu->v >> 12) & 0x07;
    // Bit 4 shifted left by 8 gives 0x0000 or 0x1000.
    uint16 table = (ppu->control & 0x10) << 8;
    uint16 addr = table + (ppu->tileIdx * 16) + fineY;
    if (upper) {
        addr += 8;
    }
    uint8 b = console->cartridge->ines_chr_read(console, addr);
    if (upper) {
        ppu->upperTile = b;
    } else {
        ppu->lowerTile = b;
    }
}


/**
 * Move v one tile to the right.
 *
 * Coarse X lives in bits 0-4 of v. While it is below 31 it is simply
 * incremented. At 31 it wraps to 0 and bit 10 is toggled, which switches to
 * the horizontally adjacent nametable (0 <-> 1, 2 <-> 3).
 */
static inline void ppu_vx_inc(PPU *ppu) {
    const uint16 current = ppu->v;
    if ((current & 0x001F) != 0x001F) {
        // Still inside the current nametable row: bump coarse X.
        ppu->v = current + 1;
        return;
    }
    // Coarse X wraps to 0 and the horizontal nametable bit flips.
    ppu->v = (current & 0xFFE0) ^ 0x0400;
}

/**
 * Move v one pixel row down.
 *
 * Fine Y lives in bits 12-14 and coarse Y in bits 5-9 of v.
 *  - fine Y < 7: increment fine Y only.
 *  - fine Y == 7: fine Y wraps to 0 and coarse Y advances:
 *      * from 29 (last tile row) it wraps to 0 and bit 11 is toggled, which
 *        switches to the vertically adjacent nametable;
 *      * from 31 (an out-of-range value that can be set through the scroll
 *        register) it wraps to 0 without switching nametables;
 *      * otherwise it is incremented.
 */
static inline void ppu_vy_inc(PPU *ppu) {
    uint16 current = ppu->v;

    if ((current & 0x7000) != 0x7000) {
        ppu->v = current + 0x1000;
        return;
    }

    // Fine Y overflows: clear bits 12-14, keep everything else.
    current &= 0x8FFF;

    uint8 row = (current >> 5) & 0x1F;
    switch (row) {
        case 29:
            row = 0;
            current ^= 0x0800;
            break;
        case 31:
            row = 0;
            break;
        default:
            row++;
            break;
    }

    // Write the new coarse Y back into bits 5-9.
    ppu->v = (current & 0xFC1F) | (row << 5);
}

/**
 * Emit eight background pixels from the tile latch, then shift the freshly
 * fetched tile into the latch and advance v by one tile.
 *
 * Latch layout (32 bits): bits 0-7 / 8-15 hold the lower / upper bit plane of
 * the tile being drawn; bits 16-23 / 24-31 hold the planes of the next tile.
 *
 * Fixes over the previous version:
 *  - the backdrop colour used palette[palette[0]] (the colour value was used
 *    as a second index, which is wrong and can read past the 32-byte
 *    palette); it is now palette[0];
 *  - `uint8 << 24` was evaluated as a signed int shift into the sign bit
 *    (undefined behaviour); the operands are now widened first;
 *  - nothing is drawn for scanlines outside the pixel buffer. The caller
 *    also runs fetch cycles on scanline 261, which previously wrote beyond
 *    the end of pixelBuffer;
 *  - the left-column test is loop-invariant and was hoisted.
 *
 * NOTE(review): tileAttr and the quadrant shift are taken from the tile that
 * was just fetched, while the pixels come from the latch, which holds a tile
 * fetched two slots earlier — the attribute looks two tiles ahead of the
 * pixels. Fine X scroll (ppu->x) is not applied either. Both need extra
 * state and are left unchanged here.
 *
 * @param ppu      PPU state
 * @param cycle    current dot; the eight pixels start at x = cycle - 8
 * @param preFetch true for the prefetch slots, which only load the latch
 */
static inline void ppu_background_mixer(PPU *ppu, uint16 cycle, bool preFetch) {
    uint32 latch = ppu->latchX32;

    // Assumes pixelBuffer holds PPU_VIDEO_WIDTH * PPU_VIDEO_HEIGHT bytes,
    // as allocated in ines_ppu_new.
    if (!preFetch && ppu->scanline < PPU_VIDEO_HEIGHT) {
        uint16 v = ppu->v;
        uint16 x0 = cycle - 8;
        uint16 y0 = ppu->scanline;
        uint8 mask = ppu->mask;
        uint8 coarseX = (v & 0x1F);
        uint8 coarseY = (v >> 5) & 0x1F;
        // Select the 2-bit field of the attribute byte for this 16x16 quadrant.
        uint8 quadX = (coarseX % 4) / 2;
        uint8 quadY = (coarseY % 4) / 2;
        uint8 shift = quadX << 1 | quadY << 2;
        uint8 *palette = ppu->palette;
        uint8 *pixelBuf = ppu->pixelBuffer;
        uint8 showBackground = mask & 0x08;
        // Mask bit 1 enables the background in the leftmost 8 pixels.
        bool showBackgroundLeftMost = (x0 > 7) || (mask & 0x02);
        // First non-backdrop entry of the selected 4-colour palette.
        uint8 base = 1 + ((ppu->tileAttr >> shift) & 0x03) * 4;

        for (int i = 0; i < 8; ++i) {
            uint8 paletteIdx = 0; // backdrop colour
            uint8 lower = (latch >> (7 - i)) & 0x01;
            uint8 upper = (latch >> (15 - i)) & 0x01;
            uint8 k = lower | (upper << 1);
            if (showBackground && showBackgroundLeftMost && k > 0) {
                paletteIdx = base + (k - 1);
            }
            pixelBuf[y0 * PPU_VIDEO_WIDTH + x0 + i] = palette[paletteIdx];
        }
    }

    // Drop the tile just drawn and load the new one into the upper half.
    latch >>= 16;
    latch = (latch & 0x0000FFFF)
            | ((uint32) ppu->upperTile << 24)
            | ((uint32) ppu->lowerTile << 16);
    ppu->latchX32 = latch;

    ppu_vx_inc(ppu);
}

/**
 * Read the PPU status register.
 *
 * Returns the status byte as it was before the read. As side effects the
 * read clears bit 7 of the stored status and resets the write toggle w that
 * is shared by the scroll and address registers. Bits 0-6 are left untouched.
 */
extern uint8 ines_ppu_status(NesConsole *console) {
    PPU *ppu = console->ppu;
    const uint8 snapshot = ppu->status;
    ppu->status = snapshot & 0x7F;
    ppu->w = 0;
    return snapshot;
}

/**
 * Write the PPU mask register: stores the byte verbatim in ppu->mask.
 */
extern void ines_ppu_mask(NesConsole *console, uint8 b) {
    console->ppu->mask = b;
}

/**
 * Write the PPU control register.
 *
 * The whole byte is stored in ppu->control, and its two low bits (nametable
 * select) are copied into bits 10-11 of the temporary address t:
 *   t: ...GH.. ........ <- d: ......GH
 */
extern void ines_ppu_control(NesConsole *console, uint8 b) {
    PPU *ppu = console->ppu;
    const uint16 nametableBits = (b & 0x03) << 10;
    ppu->control = b;
    ppu->t = (ppu->t & 0xF3FF) | nametableBits;
}

/**
 * OAM DMA: copy the 256-byte CPU page b*0x100 into primary OAM.
 *
 * Bytes are read through ines_bus_read and written starting at the current
 * OAM address, wrapping around inside the 256-byte OAM. The OAM address is
 * written back only after the whole transfer, and the CPU suspend counter is
 * increased by 512.
 */
extern void ines_ppu_dma(NesConsole *console, uint8 b) {
    PPU *ppu = console->ppu;
    const uint16 page = (b << 8);
    uint8 dst = ppu->oamAddr; // 8-bit, so it wraps naturally at 256

    for (int offset = 0; offset < 0x100; ++offset, ++dst) {
        ppu->primaryOam[dst] = ines_bus_read(console, page + offset);
    }

    ppu->oamAddr = dst;
    console->cpu->suspend += 512;
}


/**
 * Allocate and initialise a PPU.
 *
 * Changes over the previous version:
 *  - the struct is zero-allocated, so fields that were never assigned here
 *    but are read before being written elsewhere (w, x, oamAddr, latchX32,
 *    the tile latches) start at 0 instead of holding indeterminate values;
 *  - allocation failures are detected and reported by returning NULL
 *    (after releasing anything already allocated) instead of dereferencing
 *    a null pointer;
 *  - vram is 0x1000 bytes: the last row of MIRROR_LOOK_TABLE maps slots to
 *    banks 2 and 3, i.e. offsets up to 0x0FFF, which overran the former
 *    0x800-byte buffer.
 *
 * @return a new PPU owned by the caller (release with ines_ppu_dispose),
 *         or NULL if memory could not be obtained
 */
extern PPU *ines_ppu_new() {
    PPU *ppu = calloc(1, sizeof *ppu);
    if (ppu == NULL) {
        return NULL;
    }

    ppu->mask = 0;
    ppu->cycle = 0;
    ppu->status = 0;
    ppu->readBuf = 0;
    ppu->control = 0;
    ppu->scanline = 240;
    ppu->t = ppu->v = 0;
    ppu->oddFrame = False;
    ppu->vram = ines_new_array(0x1000);
    ppu->palette = ines_new_array(0x20);
    ppu->primaryOam = ines_new_array(0x100);
    ppu->secondaryOam = ines_new_array(0x20);
    ppu->pixelBuffer = ines_new_array(PPU_VIDEO_WIDTH * PPU_VIDEO_HEIGHT);

    // NOTE(review): ines_new_array's failure behaviour is not visible here;
    // the check is harmless if it never returns NULL.
    if (ppu->vram == NULL || ppu->palette == NULL || ppu->primaryOam == NULL
        || ppu->secondaryOam == NULL || ppu->pixelBuffer == NULL) {
        free(ppu->vram);
        free(ppu->palette);
        free(ppu->primaryOam);
        free(ppu->secondaryOam);
        free(ppu->pixelBuffer);
        free(ppu);
        return NULL;
    }
    return ppu;
}

/**
 * Write the PPU scroll register. The register is written twice; the write
 * toggle w selects which half is being set.
 *
 * First write (w == 0), X scroll:
 *   x:              FGH <- d: .....FGH   (fine X)
 *   t: ....... ...ABCDE <- d: ABCDE...   (coarse X)
 * Second write (w == 1), Y scroll:
 *   t: FGH..AB CDE..... <- d: ABCDEFGH   (fine Y and coarse Y)
 *
 * Fix: fine X was computed as `x & (b & 0x07)`, which can only ever clear
 * bits of the previous value; it is now simply the low three bits of b.
 */
extern void ines_ppu_scroll(NesConsole *console, uint8 b) {
    PPU *ppu = console->ppu;
    if (ppu->w == 0) {
        ppu->x = b & 0x07;
        ppu->t = (ppu->t & 0xFFE0) | (b >> 3);
        ppu->w = 1;
    } else {
        // Clear fine Y (bits 12-14) and coarse Y (bits 5-9), then fill both.
        ppu->t = (ppu->t & 0x8C1F) | ((b & 0x07) << 12) | ((b & 0xF8) << 2);
        ppu->w = 0;
    }
}

/**
 * Write the PPU address register. The register is written twice; the write
 * toggle w selects which byte is being set.
 *
 * First write (w == 0): the low six bits of b become bits 8-13 of t and
 * bit 14 of t is cleared.
 *   t: .CDEFGH ........ <- d: ..CDEFGH
 * Second write (w == 1): b becomes the low byte of t, then t is copied to v.
 *   t: ....... ABCDEFGH <- d: ABCDEFGH
 *   v: <...all bits...> <- t: <...all bits...>
 */
extern void ines_ppu_addr(NesConsole *console, uint8 b) {
    PPU *ppu = console->ppu;

    if (ppu->w != 0) {
        // Second write: low byte, then latch the full address into v.
        ppu->t = (ppu->t & 0xff00) | b;
        ppu->v = ppu->t;
        ppu->w = 0;
        return;
    }

    // First write: high byte.
    ppu->t = (ppu->t & 0x80ff) | ((b & 0x3f) << 8);
    ppu->w = 1;
}

/**
 * Release a PPU created by ines_ppu_new together with all of its buffers.
 * Passing NULL is allowed and does nothing.
 */
extern void ines_ppu_dispose(PPU *ppu) {
    if (ppu != NULL) {
        free(ppu->vram);
        free(ppu->palette);
        free(ppu->primaryOam);
        free(ppu->pixelBuffer);
        free(ppu->secondaryOam);
        free(ppu);
    }
}


extern void ines_ppu_run(NesConsole *console, int masterCycle) {
    PPU *ppu = console->ppu;

    Mirroring mirroring = console->mirroring;

    uint8 mask = ppu->mask;
    uint16 cycle = ppu->cycle;
    uint8 control = ppu->control;
    uint8 *oam = ppu->primaryOam;
    uint16 scanline = ppu->scanline;
    uint8 *secondaryOam = ppu->secondaryOam;

    int tmp = masterCycle * 3;

    for (; tmp > 0; tmp--) {
        cycle = (cycle + 1) % 342;
        ppu->cycle = cycle;
        if (cycle == 0) {
            scanline = (scanline + 1) % 262;
            ppu->scanline = scanline;
            ppu->oddFrame = !ppu->oddFrame;
        }
        bool preLine = (scanline == 261);
        bool visibleLine = (scanline < 240);
        bool renderEnable = RENDER_ENABLE(mask);

        // With rendering enabled, each odd PPU frame is one PPU clock shorter than normal.
        // This is done by skipping the first idle tick on the first visible scanline.
        // (by jumping directly from (339,261) on the pre-render scanline to (0,0) on the first
        // visible scanline and doing the last cycle of the last dummy nametable fetch there instead;
        // see this diagram).
        if (ppu->oddFrame && renderEnable) {
            bool skipLastIdle = preLine && cycle == 339;
            bool skipFirstIdle = scanline == 0 && cycle == 0;
            if (skipFirstIdle) {
                cycle = 1;
            } else if (skipLastIdle) {
                cycle = 341;
            }
        }

        if (renderEnable && (visibleLine || preLine) && cycle > 0) {
            // Cycles 1-64:
            // Secondary OAM (32-byte buffer for current sprites on scanline) is initialized to $FF,
            // attempting to read $2004 will return $FF. Internally, the clear operation is implemented
            // by reading from the OAM and writing into the secondary OAM as usual, only a signal is active that
            // makes the read always return $FF.
            if (cycle < 0x41 && (cycle % 2 == 0)) {
                uint16 idx = cycle / 2;
                secondaryOam[idx - 1] = 0xFF;
            }
            //Sprite evaluation
            if (cycle == 0x41) {
                uint8 count = 0;
                uint8 spriteSize = ((control >> 6) & 0x01) * 8;
                for (uint8 i = 0; i < 0x40; ++i) {
                    uint8 offset = i * 4;
                    uint8 y = oam[offset];
                    uint8 df = scanline - y;
                    if (df < 0 && df >= spriteSize) {
                        continue;
                    }
                    if (count < 8) {
                        uint8 idx = count * 4;
                        secondaryOam[idx] = oam[offset];
                        secondaryOam[idx + 1] = oam[offset + 1];
                        secondaryOam[idx + 2] = oam[offset + 2];
                        secondaryOam[idx + 3] = oam[offset + 3];
                    }
                    count = count + 1;
                }
                if (count > 8) {
                    ppu->status = (ppu->status & 0x20);
                }
            }
            bool fetchCycle = cycle < 257;
            bool preFetchCycle = (cycle > 320 && cycle < 337);
            if (fetchCycle || preFetchCycle) {
                uint8 mod = cycle % 8;
                switch (mod) {
                    case 0:
                        ppu_background_mixer(ppu, cycle, preFetchCycle);
                        break;
                    case 1:
                        ppu_read_tile_idx(ppu, mirroring);
                        break;
                    case 3:
                        ppu_read_tile_attr(ppu, mirroring);
                        break;
                    case 5:
                        ppu_read_tile(console, ppu, False);
                        break;
                    case 7:
                        ppu_read_tile(console, ppu, True);
                        break;
                    default:
                }
            }
            if (cycle == 256) {
                // If rendering is enabled, the PPU increments the vertical position in v. The effective Y scroll
                // coordinate is incremented, which is a complex operation that will correctly skip the attribute
                // table memory regions, and wrap to the next nametable appropriately. See Wrapping around below.
                ppu_vy_inc(ppu);
            } else if (cycle == 257) {
                // If rendering is enabled, the PPU copies all bits related to horizontal position from t to v:
                // v: ....A.. ...BCDEF <- t: ....A.. ...BCDEF
                ppu->v = (ppu->v & 0xFBE0) | (ppu->t & 0x041F);
            } else if (preLine && cycle >= 280 && cycle <= 304) {
                //
                // the PPU will repeatedly copy the vertical bits from t to v from dots 280 to 304,
                // completing the full initialization of v from t:
                //
                // v: GHIA.BC DEF..... <- t: GHIA.BC DEF.....
                //
                ppu->v = (ppu->v & 0x841F) | (ppu->t & 0x7BE0);
            }

            //Render sprite
            if (cycle > 256 && cycle < 321) {

            }
        }
        //Clear sprite overflow、sprite 0 hit and vertical blank
        if (preLine && cycle == 1) {
            ppu->status &= 0x1F;
        }
        if (scanline == 241 && cycle == 1) {
            //Set Vertical blank
            ppu->status |= 0x80;
            //If vbl enable and request cpu nmi
            if (VBL_ENABLE(ppu)) {
                ines_console_irq(console->cpu, NMI);
            }
            InesGameCallback callback = console->callback;
            if (callback != NULL) {
                callback(ppu->pixelBuffer);
            }
        }
    }
}


/**
 * Read the PPU data port at address v, then advance v.
 *
 * Reads below $3F00 are buffered: the call returns the previous buffer
 * contents and refills the buffer from the addressed byte. Palette reads
 * ($3F00 and up) return the palette entry immediately.
 *
 * Fixes over the previous version:
 *  - for nametable addresses the buffer was loaded with the *mirrored
 *    address* (truncated to 8 bits) instead of the VRAM byte stored there;
 *  - palette reads now also refill the buffer with the nametable byte that
 *    lies "underneath" the palette range, as the hardware does.
 *
 * @return the byte read from the data port
 */
extern uint8 ines_ppu_read(NesConsole *console) {
    PPU *ppu = console->ppu;
    uint16 addr = ppu->v % 0x4000;
    uint8 b = ppu->readBuf;
    if (addr < 0x2000) {
        ppu->readBuf = console->cartridge->ines_chr_read(console, addr);
    } else {
        // ppu_vram_mirror folds $3000-$3FFF onto $2000-$2FFF, so the same
        // lookup serves both plain nametable reads and the byte behind the
        // palette.
        ppu->readBuf = ppu->vram[ppu_vram_mirror(console->mirroring, addr)];
        if (addr >= 0x3f00) {
            b = ppu->palette[ppu_palette_mirror(addr)];
        }
    }
    ppu_vram_increment(ppu);
    return b;
}

/**
 * Write a byte to the PPU data port at address v, then advance v.
 *
 *  - below $2000: forwarded to the cartridge's ines_chr_write;
 *  - $2000-$3EFF: nametable VRAM, after mirroring;
 *  - $3F00-$3FFF: palette RAM.
 *
 * Fix: writes to $3F20-$3FFF were silently dropped. That range repeats the
 * 32-byte palette, and ppu_palette_mirror already reduces the address to a
 * valid index, so those writes are now stored.
 */
extern void ines_ppu_write(NesConsole *console, uint8 b) {
    PPU *ppu = console->ppu;
    uint16 addr = ppu->v % 0x4000;

    if (addr < 0x2000) {
        console->cartridge->ines_chr_write(console, addr, b);
    } else if (addr < 0x3f00) {
        ppu->vram[ppu_vram_mirror(console->mirroring, addr)] = b;
    } else {
        ppu->palette[ppu_palette_mirror(addr)] = b;
    }

    ppu_vram_increment(ppu);
}


/**
 * Read the OAM data port.
 *
 * While secondary OAM is being cleared the read returns $FF; otherwise it
 * returns the primary OAM byte at the current OAM address. The address is
 * not advanced by a read.
 *
 * Fix: the $FF window now matches the condition under which ines_ppu_run
 * actually performs the clear — rendering enabled, a visible scanline, and
 * dots 1-64. Previously $FF was also returned with rendering disabled and
 * on dot 0.
 */
extern uint8 ines_ppu_oam_read(NesConsole *console) {
    PPU *ppu = console->ppu;
    uint16 cycle = ppu->cycle;
    uint16 scanline = ppu->scanline;
    bool clearing = RENDER_ENABLE(ppu->mask) && scanline < 240
                    && cycle >= 1 && cycle <= 0x40;
    if (clearing) {
        return 0xFF;
    }
    return ppu->primaryOam[ppu->oamAddr];
}

/**
 * Write a byte to the OAM data port: store it in primary OAM at the current
 * OAM address and advance the address by one (wrapping at 256 because the
 * local copy is an 8-bit value).
 */
extern void ines_ppu_oam_write(NesConsole *console, uint8 b) {
    PPU *ppu = console->ppu;
    const uint8 slot = ppu->oamAddr;
    ppu->primaryOam[slot] = b;
    ppu->oamAddr = slot + 1;
}